db_1 <- 
  "~/Downloads/tmp2/202001_202212_02_mx_youtube_data.db"

db_2 <- 
  "~/Downloads/tmp2/202301_202312_02_mx_youtube_data.db"

db_3 <- 
  "~/Downloads/tmp2/202401_202405_02_mx_youtube_data.db"

db_4 <-
  "~/Downloads/tmp2/202405_02_mx_youtube_data.db"

library(DBI)
library(RSQLite)

comments_2020_24 <- 
  data.frame()

videos_2020_24 <-
  data.frame()


for (db in c(db_1, db_2, db_3, db_4)) {
  
  print(db)
  
  # Connect to database
  con <- dbConnect(RSQLite::SQLite(), db)
  
  # Read comments
  comments <- dbGetQuery(con, 
                         "SELECT comment_id, video_id, published_at FROM comments")
  
  comments_2020_24 <- 
    rbind(comments_2020_24, 
          comments)
  
  # Read video searches
  video_searches <- dbGetQuery(con, 
                               "SELECT video_id, publishedAt, latitude, longitude FROM video_searches")
  
  videos_2020_24 <- 
    rbind(videos_2020_24, 
          video_searches)
  
  # Close connection
  dbDisconnect(con)
  
}

library(tidyverse)
library(lubridate)

videos_2020_24$datetime <- 
  ymd_hms(videos_2020_24$publishedAt, tz = "UTC")

videos_2020_24$datetime_mx <- 
  with_tz(videos_2020_24$datetime, tzone = "America/Mexico_City")

comments_2020_24$datetime <- 
  ymd_hms(comments_2020_24$published_at, tz = "UTC")

comments_2020_24$datetime_mx <- 
  with_tz(comments_2020_24$datetime, tzone = "America/Mexico_City")

comments_wt_coordinates <- 
  videos_2020_24 |>
  left_join(comments_2020_24, by = "video_id")


video_coords_count <- 
  videos_2020_24 |>
  dplyr::distinct(video_id, latitude, longitude) |>
  dplyr::mutate(latitude = as.numeric(latitude), 
                longitude = as.numeric(longitude)) |>
  dplyr::group_by(latitude, longitude) |>
  dplyr::count() 

video_duplication <-
  videos_2020_24 |>
  dplyr::group_by(video_id) |>
  dplyr::count() 

breaks <- c(1, 15, 50, 100, 200, 300, 600, 1000, 1500, Inf)
labels <- c("1-15", "15-50", "50-100", "100-200", "200-300", 
            "300-600", "600-1000", "1000-1500", "1500+")

video_coords_count$breaks <- 
  cut(video_coords_count$n, 
      breaks = breaks,
      labels = labels,
      include.lowest = TRUE,
      right = FALSE)


comments_coords_count <- 
  comments_wt_coordinates |>
  dplyr::mutate(latitude = as.numeric(latitude), 
                longitude = as.numeric(longitude)) |>
  dplyr::group_by(latitude, longitude) |>
  dplyr::count()

breaks <- c(1, 10, 50, 200, 500, 1500, 5000, 25000, 100000, Inf)
labels <- c("1-10", "10-50", "50-200", "200-500", "500-1500", 
            "1500-5000", "5000-25000", "25000-100000", "100000+")

comments_coords_count$breaks <- 
  cut(comments_coords_count$n, 
    breaks = breaks,
    labels = labels,
    include.lowest = TRUE)



#### Visualisation #### 
library(cowplot)

videos_by_day <- 
  videos_2020_24 %>%
  mutate(date = date(datetime_mx)) %>%
  group_by(date) %>%
  count()

videos_by_24h <- 
  videos_2020_24 %>%
  mutate(time_hhmm = format(datetime_mx, "%H:%M")) %>%
  group_by(time_hhmm) %>%
  count() %>%
  mutate(time_posix = as.POSIXct(paste("1970-01-01", time_hhmm)))

comments_by_24h <- 
  comments_2020_24 %>%
  mutate(time_hhmm = format(datetime_mx, "%H:%M")) %>%
  group_by(time_hhmm) %>%
  count() %>%
  mutate(time_posix = as.POSIXct(paste("1970-01-01", time_hhmm)))

comments_by_day <- 
  comments_2020_24 %>%
  mutate(date = date(datetime_mx)) %>%
  group_by(date) %>%
  count()

require(raster)

r <- 
  raster("../data/mexico/mx-query-setup/pop-grid/mx_pd_2020_1km_Aggregated.tif")
r <- 
  aggregate(r, fact = 10, fun = sum)
r_df <- 
  as.data.frame(rasterToPoints(r))
names(r_df) <- 
  c("x", "y", "value")
r_df <- 
  r_df %>% 
  mutate(across(c(x, y), round, digits = 4))
r_df <- 
  r_df %>% 
  mutate(include = value>100) 

r_df <- 
  dplyr::left_join(r_df, 
                   video_coords_count %>%
                     dplyr::select(latitude, longitude, n_videos = n),
                   by = c(x = "longitude", y = "latitude"))

r_df <- 
  dplyr::left_join(r_df, 
                   comments_coords_count %>%
                     dplyr::select(latitude, longitude, n_comments = n),
                   by = c(x = "longitude", y = "latitude"))

library(scales)
r_df <- 
  r_df %>%
  mutate(value_cut = cut_number(value, n = 9))

label_mapping <- c(
  "[0,0.687]" = "0-1",
  "(0.687,5]" = "1-5",
  "(5,22.6]" = "5-23",
  "(22.6,88.1]" = "23-88", 
  "(88.1,261]" = "88-261",
  "(261,684]" = "261-684",
  "(684,1.83e+03]" = "684-1,830",
  "(1.83e+03,5.82e+03]" = "1,830-5,820", 
  "(5.82e+03,2.41e+06]" = "5,820-2,410,000"
)

r_df$clean_labels <- r_df$value_cut
levels(r_df$clean_labels) <-label_mapping

r_df$clean_labels <-
  recode(as.character(r_df$value_cut), !!!label_mapping)


cowplot_by_day_ts <- 
  cowplot::plot_grid(
    
    ggplot(videos_by_day, 
           aes(x = date, y = n)) +
      geom_line() +
      scale_x_date(
        date_labels = "%b %Y",           # Format: Jan 2020
        date_breaks = "3 months",        # Break every 3 months
        limits = c(as.Date("2020-01-01"), ("2024-06-01"))) +
      labs(y = "n. videos", x = NULL),
    
    ggplot(comments_by_day, 
           aes(x = date, y = n)) +
      geom_line() +
      scale_x_date(
        date_labels = "%b %Y",           # Format: Jan 2020
        date_breaks = "3 months",        # Break every 3 months
        limits = c(as.Date("2020-01-01"), ("2024-06-01"))) + 
      labs(y = "n. comments", x = NULL),
    
    nrow = 2
  )

ggplot(comments_by_day, 
       aes(x = date, y = n)) +
  geom_line() +
  geom_point() +
  scale_x_date(
    date_labels = "%b %Y",           # Format: Jan 2020
    date_breaks = "3 months",        # Break every 3 months
    limits = c(as.Date("2024-01-01"), ("2024-06-01"))) + 
  labs(y = "n. comments", x = NULL) + 
  geom_vline(xintercept = as.Date("2024-06-01"), alpha = .5)


cowplot_by_24h <-
  cowplot::plot_grid(
    
    ggplot(videos_by_24h, aes(x = time_posix, y = n)) + geom_line() +
      scale_y_log10() +
      scale_x_datetime(
        date_labels = "%H:%M",
        date_breaks = "2 hours"
      ) +
      annotation_logticks() +
      labs(x = NULL, y = "n. videos"),
    
    ggplot(comments_by_24h, aes(x = time_posix, y = n)) + geom_line() +
      scale_y_log10() +
      scale_x_datetime(
        date_labels = "%H:%M",
        date_breaks = "2 hours"
      ) +
      annotation_logticks() +
      labs(x = "Mexico City time", y = "n. comments"),
    
    
    nrow = 2
  )


cowplot_grid_pop <- 
  cowplot::plot_grid(
    
    ggplot(r_df, aes(x = value, 
                     y = n_videos)) +
      geom_point(alpha = .5, size = .8) +
      scale_x_log10(limits = c(100, NA),
                    labels = label_comma()) + 
      scale_y_log10(labels = label_comma()) +
      annotation_logticks() +
      labs(x = NULL, y = NULL),
    
    ggplot(r_df, aes(x = value, 
                     y = n_comments)) +
      geom_point(alpha = .5, size = .8) +
      scale_x_log10(limits = c(100, NA),
                    labels = label_comma()) + 
      scale_y_log10(labels = label_comma()) +
      annotation_logticks() +
      labs(x = "population", y = NULL),
    
    nrow = 2
  )


bottom_row <- 
  cowplot::plot_grid(cowplot_by_24h, 
                     cowplot_grid_pop,
                     ncol = 2)




ggsave(filename = "youtube_data_sum_stats.jpg", width = 14, height = 10,
       
       cowplot::plot_grid(cowplot_by_day_ts, 
                          bottom_row,
                          nrow = 2,
                          rel_heights = c(.7, 1))
       
)


ggsave(filename = "geo_videos_comments.jpg", width = 11, height =12,
       
       plot_grid(
         
         ggplot(r_df, aes(x=x, y=y)) + 
           geom_tile(aes(fill = clean_labels)) +
           scale_fill_brewer(palette = "Spectral", direction = 1) +
           theme_void() +
           labs(
             title = "Population 2020",
             fill = NULL,
           ) +
           guides(fill = guide_legend(nrow = 2)) +
           theme(legend.position = "bottom",
                 plot.background = element_rect(fill = "white", color = NA),
                 panel.background = element_rect(fill = "white", color = NA)),
         
         plot_grid(
           
           video_coords_count |>
             ggplot(aes(longitude, y=latitude)) +
             geom_tile(aes(fill = breaks)) +
             scale_fill_brewer(palette = "YlOrBr", direction = 1) +
             theme_void() +
             labs(
               title = "Geolocated videos 2020-24 (n = 3,398,143)",
               fill = NULL,
             ) +
             guides(fill = guide_legend(nrow = 2)) +
             theme(legend.position = "bottom",
                   plot.background = element_rect(fill = "white", color = NA),
                   panel.background = element_rect(fill = "white", color = NA)),
           
           comments_coords_count |>
             ggplot(aes(longitude, y=latitude)) +
             geom_tile(aes(fill = breaks)) +
             scale_fill_brewer(palette = "YlGnBu", direction = 1) + 
             theme_void() +
             labs(
               title = "Geolocated comments 2020-24 (n = 43,900,491)",
               fill = NULL,
             ) +
             guides(fill = guide_legend(nrow = 2)) +
             theme(legend.position = "bottom",
                   plot.background = element_rect(fill = "white", color = NA),
                   panel.background = element_rect(fill = "white", color = NA)),
           
           nrow = 1
         ),
         
         nrow = 2, rel_heights = c(1, .5)
         
       )
       
)


#### Timeline ####

comments_score <- 
  readRDS("/Volumes/PRJ-youtube_api/mexico/processed-data/comment-scores/comments_score.rds")


library(ggplot2)
library(dplyr)
library(lubridate)

# Prepare data for plotting
daily_range <- comments_score %>%
  mutate(
    date = as.Date(posix)  # Extract date from datetime
  ) %>%
  group_by(date) %>%
  summarise(
    mean_score = mean(scalar_sum, na.rm = TRUE),
    sd_score = sd(scalar_sum, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  )

period_range <-
  daily_range %>%
  summarise(
    median = median(mean_score, na.rm = TRUE),
    quant75 = quantile(mean_score, .75, na.rm = TRUE),
    quant25 = quantile(mean_score, .25, na.rm = TRUE)
  )



base <- 
  ggplot(daily_range, aes(x = date)) +
  geom_rect(aes(xmin = as.Date("2020-01-01"), xmax = as.Date("2024-06-01"), 
                ymin = period_range$quant25[1], 
                ymax = period_range$quant75[1]), 
            fill = "lightblue", alpha = 0.5) +
  geom_line(aes(y = mean_score), size = .4, alpha = .5) +
  labs(
    title = "Average Daily Comment Scores",
    y = NULL, y = NULL,
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_y_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = label_comma() 
  ) +
  # Vertical lines
  # geom_vline(xintercept = as.Date("2020-03-10"), linetype = "dashed", color = "red", alpha = 0.7) +
  # geom_vline(xintercept = as.Date("2023-01-05"), linetype = "dashed", color = "red", alpha = 0.7) +
  geom_vline(xintercept = as.Date("2024-06-02"), linetype = "dashed", color = "red", alpha = 0.7) +
  # Text annotations
  annotate("text", y = 0.09, x = as.Date("2020-06-10"), 
           label = "March 2020\nmassive protests against violence") +
  annotate("text", y = 0.07, x = as.Date("2023-01-05"),
           label = "January 2023\nSinaloa unrest following Ovidio Guzmán arrest") +
  annotate("text", y = 0.05, x = as.Date("2024-06-02"),
           label = "2024\nGeneral Elections") +
  geom_hline(yintercept = period_range$median[1]) +
  labs(x = NULL, y = NULL) + 
  annotate("rect", 
           ymin = 0.016, ymax = 0.032, 
           xmin = as.Date("2024-01-01"), xmax = as.Date("2024-06-01"),
           fill = "red", alpha = 0.2, color = "red") +
  annotate("text", 
           x = as.Date("2024-03-15"),  # Center of the box
           y = 0.05,  # Just above the box (ymax = 0.1, so 0.12)
           label = "2024 Federal Election\nCampaign")
  


base2 <- 
  ggplot(daily_range %>%
           dplyr::filter(date > as.Date("2024-01-01") &
                           date < as.Date("2024-06-03")), aes(x = date)) +
  geom_rect(aes(xmin = as.Date("2024-01-01"), xmax = as.Date("2024-06-01"), 
                ymin = period_range$quant25[1], 
                ymax = period_range$quant75[1]), 
            fill = "lightblue", alpha = 0.5) +
  geom_line(aes(y = mean_score), size = .4, alpha = .5) +
  labs(
    title = "2024 Federal election campaign",
    y = NULL, y = NULL,
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  # Vertical lines
  geom_hline(yintercept = period_range$median[1]) +
  labs(x = NULL, y = NULL)



ggsave(filename = "comment_score_ts.jpg", width = 12, height = 10,
       
       cowplot::plot_grid(
         
       base +
         scale_x_date(
           limits = c(as.Date("2020-01-01"), as.Date("2024-06-01")),
           date_breaks = "6 months",
           date_labels = "%Y-%m"),
       
       base2 +
         scale_x_date(
           limits = c(as.Date("2024-01-01"), as.Date("2024-06-01")),
           date_breaks = "1 month",
           date_labels = "%Y-%m"),
       
       nrow = 2, rel_heights = c(1,1.2)
       
       )
)

#### Score by State ####

folder_path <- "/Volumes/PRJ-youtube_api/mexico/processed-data/idw-daily"

# Get all CSV files
csv_files <- list.files(folder_path, pattern = "\\.csv$", full.names = TRUE)

# Read and combine
csv_list <- lapply(csv_files, read.csv)
combined_data <- do.call(rbind, csv_list)

priogrid_wt_pop <-
  read.csv("/Volumes/PRJ-youtube_api/mexico/processed-data/priogrid_wt_pop/priogrid_mex_wt_pop_2020.csv")

combined_data$pop <- 
  priogrid_wt_pop$population[match(combined_data$gid, 
                                   priogrid_wt_pop$gid)]


library(sf)

priogrid.sf <- 
  read_sf("/Volumes/PRJ-youtube_api/mexico/secondary-data/priogrid_mex/priogrid_mex.shp")

mx_state.sf <- 
  read_sf("/Volumes/PRJ-youtube_api/mexico/secondary-data/mex-state-shape/mexican-states.shp")

grid_centroids <- st_centroid(priogrid.sf)

res <- st_join(grid_centroids, mx_state.sf, join = st_nearest_feature)

priogrid.sf$state <- 
  res$`ISO3166-2`

combined_data$state <- 
  priogrid.sf$state[match(combined_data$gid,
                          priogrid.sf$gid)]

vpi_by_state_year <- 
  combined_data %>%
  dplyr::mutate(year = as.numeric(substr(group, 1, 4))) %>%
  dplyr::filter(!is.na(pop)) %>%
  dplyr::group_by(state, year) %>%
  dplyr::summarise(weigh_mean_vpi = median(var1.pred))


mexico_peace_index_2020 <- 
  read.csv("~/Downloads/mexico_peace_index_2020.csv")

mexico_peace_index_2020$STATE
mx_state.sf$`ISO3166-2`[]

ggplot(vpi_by_state_year %>%
         dplyr::filter(year >= 2020),
       aes(x = year, y = weigh_mean_vpi, label = state, group = state)) +
  geom_text() +
  geom_line()
  
library(fuzzyjoin)
library(stringdist)

mx_state.df <- 
  data.frame(iso = mx_state.sf$`ISO3166-2`,
             name = mx_state.sf$name)

result <- 
  mexico_peace_index_2020  |>
  fuzzy_left_join(mx_state.df, by = c("STATE" = "name"),
                  match_fun = function(x, y) stringdist(x, y, method = "jw") < 0.05)

result$iso[result$STATE == "Coahuila"] <- "MX-COA"
result$iso[result$STATE == "Veracruz"] <- "MX-VER"
result$iso[result$STATE == "Mexico City"] <- "MX-MEX"
result$iso[result$STATE == "Michoacán"] <- "MX-MIC"

result <-
  result %>%
  left_join(vpi_by_state_year %>%
              dplyr::filter(year == 2020), by = c(iso = "state"))

ggplot(result, aes(x = weigh_mean_vpi, 
                   y = OVERALL_SCORE)) + 
  geom_point() + 
  geom_smooth(method = "lm")


#### Comment length ####

library(tidyverse)
library(data.table)

df <- fread("~/Downloads/comments_2020_24_wt_wc.csv", select = c("comment_id", "wc"))

comments_score <- readRDS("~/Downloads/tmp2/comments_score.rds")

df$wc <- as.numeric(df$wc)

# write.csv(df, "~/Downloads/comments_2020_24_wt_wc.csv", row.names = FALSE)


quantile(df$wc, p = c(.5, .95, .99))

df$scalar_sum <- 
  comments_score$scalar_sum[match(df$comment_id, 
                                  comments_score$comment_id)]

df$scalar_pos <- 
  df$scalar_sum > 0

ggsave(filename = "comment_word_count.jpg",  width = 8, height = 4,
       ggplot(df, aes(x = wc, y = scalar_pos)) + 
         geom_boxplot()  +
         scale_x_continuous(trans = "log10") +
         annotation_logticks(sides = "b") +
         theme_bw() + labs(x = "comment length (n. of words)",
                           y = "comment violence score is positive"))

View(df %>%
       group_by(scalar_pos) %>%
       summarise(
         n = n(),
         prop = n() / nrow(df),
         zero_length = sum(wc == 0),
         mean = mean(wc, na.rm = TRUE),
         median = median(wc, na.rm = TRUE),
         sd = sd(wc, na.rm = TRUE),
         min = min(wc, na.rm = TRUE),
         max = max(wc, na.rm = TRUE)
       ))

df_pos <- 
  df[df$scalar_pos,]

cor.test(log10(df_pos$wc), log10(df_pos$scalar_sum))

library(ggExtra)

p <- 
  ggplot(df_pos, aes(x = wc, y = scalar_sum)) +
  geom_jitter(width = 0.25, height = 0.25, alpha = .5) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  stat_smooth(se = FALSE) +
  annotation_logticks(sides = "bl") +
  labs(y = "comment violence score", x = "comment length (n. of words)") +
  theme_bw()


ggsave(filename = "comment_word_count_corr_scalar_sum.jpg",  width = 8, height = 4,
       ggMarginal(p, type="histogram")
)

t0 <- 
  cor.test(
    df_pos$scalar_sum,
    df_pos$wc)

t1 <- 
  cor.test(
    df_pos$scalar_sum[df_pos$wc<25],
    df_pos$wc[df_pos$wc<25])

t2 <- 
  cor.test(
    df_pos$scalar_sum[df_pos$wc<50],
    df_pos$wc[df_pos$wc<50])

t3 <-
  cor.test(
    df_pos$scalar_sum[df_pos$wc<100],
    df_pos$wc[df_pos$wc<100])

t4 <- 
  cor.test(
    df_pos$scalar_sum[df_pos$wc>=100],
    df_pos$wc[df_pos$wc>=100])

View(
  data.frame(
    wc = c("Any", "< 25", "< 50", "< 100", "100+"),
    cor = c(t0$estimate, t1$estimate, t2$estimate, t3$estimate, t4$estimate),
    perc =  as.character(c("100", 
                           round(sum(df_pos$wc<25)/nrow(df_pos)*100,2), 
                           round(sum(df_pos$wc<50)/nrow(df_pos)*100,2),
                           round(sum(df_pos$wc<100)/nrow(df_pos)*100,2),
                           round(sum(df_pos$wc>=100)/nrow(df_pos)*100,2)))))

